/* XXL: The eXtensible and fleXible Library for data processing

Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger
                        Head of the Database Research Group
                        Department of Mathematics and Computer Science
                        University of Marburg
                        Germany

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library;  If not, see <http://www.gnu.org/licenses/>. 

    http://code.google.com/p/xxl/

*/

package xxl.core.cursors.distincts;

import java.util.Iterator;

import xxl.core.collections.bags.Bag;
import xxl.core.collections.bags.ListBag;
import xxl.core.collections.queues.ArrayQueue;
import xxl.core.collections.queues.Queue;
import xxl.core.cursors.AbstractCursor;
import xxl.core.cursors.Cursor;
import xxl.core.cursors.Cursors;
import xxl.core.cursors.wrappers.QueueCursor;
import xxl.core.functions.AbstractFunction;
import xxl.core.functions.Function;
import xxl.core.predicates.Equal;
import xxl.core.predicates.Predicate;
import xxl.core.predicates.RightBind;

/**
 * A nested-loops implementation of the distinct operator, i.e., all duplicates
 * contained in a cursor will be removed. Depending on the specified memory
 * size and object size as many elements of the input cursor as possible will
 * be inserted into a temporal bag (typically located in main memory). To
 * guarantee that no duplicates will be inserted into it, the bag is searched
 * for duplicates with the help of a user defined predicate. If not all
 * elements can be inserted into the bag they will be temporarily stored in a
 * queue, that is typically resided in external memory, and will be inserted
 * when the bag has been emptied due to calls to the <code>next</code> method
 * of this class.
 * 
 * <p><b>Example usage (1):</b>
 * <code><pre>
 *   NestedLoopsDistinct&lt;Integer&gt; distinct = new NestedLoopsDistinct&lt;Integer&gt;(
 *       new DiscreteRandomNumber(new JavaDiscreteRandomWrapper(21), 30),
 *       32,
 *       4
 *   );
 *   
 *   distinct.open();
 *   
 *   while(distinct.hasNext())
 *       System.out.println(distinct.next());
 *       
 *   distinct.close();
 * </pre></code>
 * The input cursor shown in this example deliveres 30 randomly distributed
 * integer numbers contained in the interval [0, 20]. The main memory size is
 * set to 32 bytes and the object size to 4 bytes. So the
 * {@link ListBag list-bag} generated by the default factory method that is
 * used in this case as default is able to hold a maximum 3 elements in main
 * memory. The remaining one element, that fits in main memory, is reserved for
 * the comparison of two elements. Due to the fact that not all elements of the
 * input cursor can be stored in the temporal main memory bag, the remaining
 * elements will be stored in an {@link ArrayQueue array-queue} returned by the
 * factory method that is used as default. Normally this queue should be placed
 * in external memory. Running this example shows that no duplicates are
 * returned by this operator.</p>
 * 
 * <p><b>Note:</b> If an input iteration is given by an object of the class
 * {@link Iterator}, i.e., it does not support the <code>peek</code> operation,
 * it is internally wrapped to a cursor.</p>
 *
 * @param <E> the type of the elements returned by this distinct operator.
 * @see java.util.Iterator
 * @see xxl.core.cursors.Cursor
 * @see xxl.core.collections.bags.Bag
 * @see xxl.core.collections.queues.Queue
 * @see xxl.core.cursors.distincts.SortBasedDistinct
 * @see xxl.core.relational.cursors.NestedLoopsDistinct
 */
public class NestedLoopsDistinct<E> extends AbstractCursor<E> {

	/**
	 * The input cursor delivering the elements for the distinct operation.
	 */
	protected Cursor<? extends E> cursor;

	/**
	 * The result cursor delivering only distinct elements of the input cursor.
	 */
	protected Cursor<E> results = null;

	/**
	 * A queue used to store the remaining elements during the operation if not
	 * all elements can be stored in the temporal main memory bag.
	 */
	protected Queue<E> remainder = null;

	/**
	 * A parameterless function returning an empty bag on demand.
	 */
	protected Function<?, ? extends Bag<E>> newBag;

	/**
	 * A parameterless function returning an empty queue on demand.
	 */
	protected Function<?, ? extends Queue<E>> newQueue;

	/**
	 * An unary predicate determining if two elements are equal. If the
	 * returned value is <code>true</code> the given element and the next
	 * element of the internal used cursor that holds all remaining elements do
	 * match.
	 */
	protected Predicate<? super E> predicate;

	/**
	 * The maximum number of elements that can be stored in the bag returned by
	 * the function <code>newBag</code>.
	 */
	protected int maxTuples;

	/**
	 * An internal used flag signaling if the elements inserted into the bag
	 * are delivered from the input cursor or the queue <code>remainder</code>.
	 */
	protected boolean initialized = false;

	/**
	 * Creates a new instance of the nested-loops distinct operator. The input
	 * iterator is wrapped to a cursor. Determines the maximum number of
	 * elements that can be stored in the bag used for the temporal storage of
	 * the elements of the input cursor:
	 * <pre>
	 *     maxTuples = memSize / objectSize - 1
	 * </pre>
	 *
	 * @param input the input iterator delivering the elements.
	 * @param memSize the maximum amount of available main memory (bytes) for
	 *        the bag.
	 * @param objectSize the size (bytes) needed to store one element.
	 * @param predicate the binary predicate returning <code>true</code> if two
	 *        elements are equal.
	 * @param newBag a parameterless function returning an empty bag.
	 * @param newQueue a parameterless function returning an empty queue.
	 * @throws IllegalArgumentException if not enough main memory is available.
	 */
	public NestedLoopsDistinct(Iterator<? extends E> input, int memSize, int objectSize, Predicate<? super E> predicate, Function<?, ? extends Bag<E>> newBag, Function<?, ? extends Queue<E>> newQueue) throws IllegalArgumentException {
		this.cursor = Cursors.wrap(input);
		this.newBag = newBag;
		this.newQueue = newQueue;
		this.predicate = predicate;
		this.maxTuples = memSize / objectSize - 1;
		if (memSize < 2*objectSize)
			throw new IllegalArgumentException("insufficient main memory available.");
	}

	/**
	 * Creates a new instance of the nested-loops distinct operator. The input
	 * iterator is wrapped to a cursor. Determines the maximum number of
	 * elements that can be stored in the bag used for the temporal storage of
	 * the elements of the input cursor:
	 * <pre>
	 *     maxTuples = memSize / objectSize - 1
	 * </pre>
	 * Uses default factory methods for list-bags and array-queues. Determines
	 * the equality between two elements with the help of the default instance
	 * of the predicate {@link Equal}.
	 *
	 * @param input the input iterator delivering the elements.
	 * @param memSize the maximum amount of available main memory (bytes) for
	 *        the bag.
	 * @param objectSize the size (bytes) needed to store one element.
	 * @throws IllegalArgumentException if not enough main memory is available.
	 */
	public NestedLoopsDistinct(Iterator<? extends E> input, int memSize, int objectSize) throws IllegalArgumentException {
		this(
			input,
			memSize,
			objectSize,
			Equal.DEFAULT_INSTANCE,
			new AbstractFunction<Object, ListBag<E>>() {
				public ListBag<E> invoke() {
					return new ListBag<E>();
				}
			},
			new AbstractFunction<Object, ArrayQueue<E>>() {
				public ArrayQueue<E> invoke() {
					return new ArrayQueue<E>();
				}
			}
		);
	}

	/**
	 * Opens the nested-loops distinct operator, i.e., signals the cursor to
	 * reserve resources, open the input iteration, etc. Before a cursor has
	 * been opened calls to methods like <code>next</code> or <code>peek</code>
	 * are not guaranteed to yield proper results. Therefore <code>open</code>
	 * must be called before a cursor's data can be processed. Multiple calls
	 * to <code>open</code> do not have any effect, i.e., if <code>open</code>
	 * was called the cursor remains in the state <i>opened</i> until its
	 * <code>close</code> method is called.
	 * 
	 * <p>Note, that a call to the <code>open</code> method of a closed cursor
	 * usually does not open it again because of the fact that its state
	 * generally cannot be restored when resources are released respectively
	 * files are closed.</p>
	 */
	public void open() {
		if (isOpened)
			return;
		super.open();
		cursor.open();
	}
	
	/**
	 * Closes the nested-loops distinct operator. Signals the operator to clean
	 * up resources, close the input cursor as well as the internally used bag
	 * and queue. After a call to <code>close()</code> calls to methods like
	 * <code>next</code> or <code>peek</code> are not guaranteed to yield
	 * proper results. Multiple calls to <code>close</code> do not have any
	 * effect, i.e., if <code>close</code> was called the nested-loops distinct
	 * operator remains in the state "closed".
	 */
	public void close() {
		if (isClosed)
			return;
		super.close();
		if (remainder != null)
			remainder.close();
		cursor.close();
		results.close();
	}

	/**
	 * Returns <code>true</code> if the iteration has more elements. (In other
	 * words, returns <code>true</code> if <code>next</code> or
	 * <code>peek</code> would return an element rather than throwing an
	 * exception.)
	 * 
	 * <p>Builds a temporal bag calling <code>newBag.invoke()</code> and stores
	 * as much distinct elements of the input cursor in this bag as possible.
	 * After that all remaining elements of the input cursor are inserted in
	 * the queue <code>remainder</code>. With the intention to guarantee that
	 * no duplicate elements are inserted in the temporal bag the bag's
	 * <code>query</code> method verifying each element concerning the
	 * specified predicate is called.<br />
	 * An element is only inserted if the result of the <code>query</code>
	 * method (a cursor) is empty. At last the bag's <code>cursor</code> method
	 * is called and the result cursor's reference is set to this cursor. If
	 * the result cursor contains any elements, <code>true</code> is returned,
	 * otherwise <code>false</code>. If the queue <code>remainder</code>
	 * contains further elements the whole procedure is returned by the next
	 * call to this method and at this time the elements inserted in the
	 * temporal bag are delivered by the queue.
	 *
	 * @return <code>true</code> if the nested-loops distinct operator has more
	 *         elements.
	 */
	protected boolean hasNextObject() {
		if (results == null || !results.hasNext()) {
			Cursor<? extends E> input;
			if (initialized)
				input = new QueueCursor<E>(remainder);
			else
				input = cursor;
			Bag<E> tmpBag = newBag.invoke();
			int counter = 0;
			if (initialized && remainder != null)
				counter = remainder.size();
			while((!initialized && input.hasNext()) || (initialized && counter-- > 0)) {
				E next = input.next();
				if (!tmpBag.query(new RightBind<E>(predicate, next)).hasNext())
					if (tmpBag.size() < maxTuples)
						tmpBag.insert(next);
					else {
						if (remainder == null)
							(remainder = newQueue.invoke()).open();
						remainder.enqueue(next);
					}
			}
			initialized = true;
			results = tmpBag.cursor();
			return results.hasNext();
		}
		return true;
	}

	/**
	 * Returns the next element in the iteration. This element will be removed
	 * from the iteration, if <code>next</code> is called.
	 *
	 * @return the next element in the iteration.
	 */
	protected E nextObject() {
		E result = results.next();
		results.remove();
		return result;
	}

	/**
	 * Resets the nested-loops distinct operator to its initial state (optional
	 * operation). So the caller is able to traverse the underlying data
	 * structure again. The modifications, removes and updates concerning the
	 * underlying data structure, are still persistent. This method resets the
	 * input iteration, closes the result cursor, sets it to <code>null</code>
	 * and clears the queue <code>remainder</code>.
	 *
	 * @throws UnsupportedOperationException if the <code>reset</code>
	 *         operation is not supported by the nested-loops distinct
	 *         operator.
	 */
	public void reset() throws UnsupportedOperationException {
		super.reset();
		if (remainder != null)
			remainder.clear();
		cursor.reset();
		results.close();
		results = null;
		initialized = false;
	}
	
	/**
	 * Returns <code>true</code> if the <code>reset</code> operation is
	 * supported by the nested-loops distinct operator. Otherwise it returns
	 * <code>false</code>.
	 *
	 * @return <code>true</code> if the <code>reset</code> operation is
	 *         supported by the cursor, otherwise <code>false</code>.
	 */
	public boolean supportsReset() {
		return cursor.supportsReset();
	}
}
